Packages

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(knitr)

Load the Data

# Load one month (January 2018) of 15-minute interval meter reads.
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# here::here() so the analysis is portable.
data <- read_csv("C:/Users/craig/source/repos/ElectricMeterClustering/2018.01.csv")
## Parsed with column specification:
## cols(
##   h.ReadDate = col_datetime(format = ""),
##   i.ReadDate = col_datetime(format = ""),
##   Readvalue = col_double(),
##   Uom = col_character(),
##   ReadQualityCode = col_integer(),
##   RecordInterval = col_integer(),
##   VeeFlag = col_character(),
##   ChannelStatus = col_character(),
##   SubstationName = col_character(),
##   MeterIdentifier = col_integer(),
##   LocationNumber = col_integer()
## )

Functions

# Number a datetime's position within the day as a 1-based *ending* interval:
# with 15-minute data, 00:15 -> 1, 00:30 -> 2, ..., 23:45 -> 95 and 00:00 -> 96
# (midnight is the END of the previous day's last interval).
#
# dt      : POSIXct/POSIXlt datetime; vectors are supported.
# divisor : interval length in minutes. Was a hard-coded TODO; now a parameter
#           defaulting to the old value of 15, so existing callers are unchanged.
fxnInterval_from_datetime <- function(dt, divisor = 15) {
  lt <- as.POSIXlt(dt)                  # base R; avoids the lubridate dependency
  n_intervals <- (24 * 60) / divisor    # 96 for 15-minute intervals

  # (h * 60 + m) / divisor — same arithmetic as before, without the dead
  # `interval = 96` pre-assignment the original overwrote immediately.
  interval <- ((lt$hour * 60) + lt$min) / divisor

  # 00:00 computes to 0 but labels the last interval of the (previous) day.
  # ifelse() keeps this correct for vector input, where the old scalar `if` broke.
  ifelse(interval == 0, n_intervals, interval)
}

# TODO: Would be nice to have something to return HHMM and make factors
# This will be easy to read for users and if we factor them or index it 
# will plot correctly.  Might run faster too.
fxn_hour_minute <- function(dt) {
  # Reduce a datetime to its "HH:MM" label (interval-ending time of day).
  # as.POSIXct() is a no-op for datetime input but lets character input
  # in "%Y-%m-%d %H:%M" form be parsed too.
  ts <- as.POSIXct(dt, format = "%Y-%m-%d %H:%M")
  format(ts, format = "%H:%M")
}
fxn_histogram_plot <- function(df, LocationNumber) {
  # Draw three stacked histograms of Readvalue for one location:
  # raw scale, sqrt x-scale, and log10 x-scale (for the long right tail).
  # Fix: p2/p3 titles previously placed "(bin=0.05)" before the location
  # number, inconsistent with p1 — all three now share one title format.
  hist_base <- ggplot(df, aes(x = Readvalue)) +
    geom_histogram(binwidth = 0.05, color = 'black', fill = '#333333')
  plot_title <- paste("Histogram of kWh for Location ", LocationNumber, " (bin=0.05)")

  p1 <- hist_base +
    ggtitle(plot_title)

  p2 <- hist_base +
    scale_x_sqrt() +
    xlab("sqrt(ReadValue)") +
    ggtitle(plot_title)

  p3 <- hist_base +
    scale_x_log10() +
    xlab("scale_x_log10") +
    ggtitle(plot_title)

  grid.arrange(p1, p2, p3, ncol = 1)
}


fxn_daily_scatter_plot <- function(df, LocationNumber) {
  # One semi-transparent line per day (group = h.ReadDate) across the
  # interval-ending times, overlaid to show the location's daily load shapes.
  # (Name says "scatter" but this intentionally draws lines, as before.)
  daily_shapes <- ggplot(df, aes(x = hhmm, y = Readvalue, group = h.ReadDate)) +
    geom_line(alpha = 1 / 2) +
    xlab("Time") +
    ylab("kWh") +
    ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
  daily_shapes
}
  
fxn_dow_scatter_plot <- function(df, LocationNumber) {
  # Daily load-shape lines, faceted into one panel per day of the week
  # (7 columns, one week per row). Returns the ggplot object.
  ggplot(df, aes(x = hhmm, y = Readvalue, group = h.ReadDate)) +
    geom_line(alpha = 1 / 2) +
    facet_wrap(~weekday, ncol = 7) +
    xlab("Time") +
    ylab("kWh") +
    ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
}

fxn_day_scatter_plot <- function(df, LocationNumber) {
  # One facet per calendar day (h.ReadDate x weekday), 7 per row — a
  # calendar-style grid of the month's daily load shapes. Returns the plot.
  ggplot(df, aes(x = hhmm, y = Readvalue, group = h.ReadDate)) +
    geom_line(alpha = 1 / 2) +
    facet_wrap(h.ReadDate ~ weekday, ncol = 7) +
    xlab("Time") +
    ylab("kWh") +
    ggtitle(paste("Load Shapes | January 2018 | group = h.ReadDate | ", LocationNumber, ""))
}

fxn_scaled_plot <- function(locationNumber, df = data_summary) {
  # Side-by-side comparison of the raw mean vs the scaled mean load shape
  # for one location.
  # `df` defaults to the global data_summary so existing call sites are
  # unchanged, but the summary frame can now be passed in explicitly.
  loc_summary <- subset(df, LocationNumber == locationNumber)

  p1 <- ggplot(loc_summary, aes(x = hhmm, y = mean)) +
    geom_point() +
    xlab("Time") +
    ylab("mean") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
    ggtitle(paste("Load Shapes | January 2018 | ", locationNumber, ""))

  p2 <- ggplot(loc_summary, aes(x = hhmm, y = scaled_mean)) +
    geom_point() +
    xlab("Time") +
    ylab("scaled_mean") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
    ggtitle(paste("Load Shapes | January 2018 | ", locationNumber, ""))

  grid.arrange(p1, p2, ncol = 2)
}

# TODO FIX NAME: there is no facet_wrap here — this draws mean and scaled_mean
# line charts side by side for one or more locations.
fxn_scaled_plot_facet_wrap <- function(locationNumber, df = data_summary) {
  # `df` defaults to the global data_summary for backward compatibility,
  # but can now be supplied explicitly.
  loc_summary <- subset(df, LocationNumber %in% locationNumber)

  p1 <- ggplot(loc_summary, aes(x = hhmm, y = mean, group = LocationNumber, color = LocationNumber)) +
    geom_line() +
    xlab("Time") +
    ylab("mean") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
    ggtitle(paste("Load Shapes | January 2018"))

  p2 <- ggplot(loc_summary, aes(x = hhmm, y = scaled_mean, group = LocationNumber, color = LocationNumber)) +
    geom_line() +
    xlab("Time") +
    ylab("scaled_mean") +
    theme(axis.text.x = element_text(angle = -90, hjust = 0)) +
    ggtitle(paste("Load Shapes | January 2018"))

  grid.arrange(p1, p2, ncol = 2)
}
# test$i.ReadDate
# 
# for (i in 1:1){
#   interval <- fxnInterval_from_datetime(dt)
#   hhmm <- fxn_hour_minute(dt)
#   print(sprintf("%s is time: %s %i", dt, hhmm, interval)) 
#   #print(sprintf("%s",hhmm)) 
#   dt <- dt + minutes(15)
# }

Data Review

Customer Wants us to find similar accounts from the same rate codes as the profiles provided. - January - 2018 - kWh

str(data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    40135207 obs. of  11 variables:
##  $ h.ReadDate     : POSIXct, format: "2018-01-01" "2018-01-01" ...
##  $ i.ReadDate     : POSIXct, format: "2018-01-01 00:15:00" "2018-01-01 00:30:00" ...
##  $ Readvalue      : num  2.34 2.27 2.27 2.5 2.29 ...
##  $ Uom            : chr  "KWH" "KWH" "KWH" "KWH" ...
##  $ ReadQualityCode: int  10 3 3 3 3 3 3 3 3 3 ...
##  $ RecordInterval : int  900 900 900 900 900 900 900 900 900 900 ...
##  $ VeeFlag        : chr  "False" "False" "False" "False" ...
##  $ ChannelStatus  : chr  NA NA NA NA ...
##  $ SubstationName : chr  "WARTRACE" "WARTRACE" "WARTRACE" "WARTRACE" ...
##  $ MeterIdentifier: int  300029 300029 300029 300029 300029 300029 300029 300029 300029 300029 ...
##  $ LocationNumber : int  3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 3140300 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 11
##   .. ..$ h.ReadDate     :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ i.ReadDate     :List of 1
##   .. .. ..$ format: chr ""
##   .. .. ..- attr(*, "class")= chr  "collector_datetime" "collector"
##   .. ..$ Readvalue      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ Uom            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ReadQualityCode: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ RecordInterval : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ VeeFlag        : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ChannelStatus  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ SubstationName : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ MeterIdentifier: list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ LocationNumber : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
## # A tibble: 6 x 11
##   h.ReadDate          i.ReadDate          Readvalue Uom   ReadQualityCode
##   <dttm>              <dttm>                  <dbl> <chr>           <int>
## 1 2018-01-01 00:00:00 2018-01-01 00:15:00      2.34 KWH                10
## 2 2018-01-01 00:00:00 2018-01-01 00:30:00      2.27 KWH                 3
## 3 2018-01-01 00:00:00 2018-01-01 00:45:00      2.27 KWH                 3
## 4 2018-01-01 00:00:00 2018-01-01 01:00:00      2.50 KWH                 3
## 5 2018-01-01 00:00:00 2018-01-01 01:15:00      2.29 KWH                 3
## 6 2018-01-01 00:00:00 2018-01-01 01:30:00      2.31 KWH                 3
## # ... with 6 more variables: RecordInterval <int>, VeeFlag <chr>,
## #   ChannelStatus <chr>, SubstationName <chr>, MeterIdentifier <int>,
## #   LocationNumber <int>
##    h.ReadDate                    i.ReadDate                 
##  Min.   :2018-01-01 00:00:00   Min.   :2018-01-01 00:00:00  
##  1st Qu.:2018-01-08 00:00:00   1st Qu.:2018-01-08 19:30:00  
##  Median :2018-01-16 00:00:00   Median :2018-01-16 14:00:00  
##  Mean   :2018-01-16 01:20:56   Mean   :2018-01-16 13:20:32  
##  3rd Qu.:2018-01-24 00:00:00   3rd Qu.:2018-01-24 07:30:00  
##  Max.   :2018-01-31 00:00:00   Max.   :2018-02-01 00:00:00  
##    Readvalue            Uom            ReadQualityCode  RecordInterval 
##  Min.   : -0.0375   Length:40135207    Min.   :-1.000   Min.   :300.0  
##  1st Qu.:  0.0950   Class :character   1st Qu.: 3.000   1st Qu.:900.0  
##  Median :  0.3845   Mode  :character   Median : 3.000   Median :900.0  
##  Mean   :  0.7188                      Mean   : 2.973   Mean   :899.9  
##  3rd Qu.:  0.9858                      3rd Qu.: 3.000   3rd Qu.:900.0  
##  Max.   :263.9831                      Max.   :10.000   Max.   :900.0  
##    VeeFlag          ChannelStatus      SubstationName     MeterIdentifier 
##  Length:40135207    Length:40135207    Length:40135207    Min.   : 36920  
##  Class :character   Class :character   Class :character   1st Qu.:301064  
##  Mode  :character   Mode  :character   Mode  :character   Median :401426  
##                                                           Mean   :392836  
##                                                           3rd Qu.:405627  
##                                                           Max.   :905071  
##  LocationNumber   
##  Min.   :  10260  
##  1st Qu.: 480140  
##  Median :2110910  
##  Mean   :2809667  
##  3rd Qu.:4243280  
##  Max.   :9290400

Data Wrangling

Date Formatting and Creating Grouping Keys

# Derive calendar/grouping columns from the read timestamps:
# h.ReadDate carries the day, i.ReadDate the interval-ending timestamp.
data <- data %>%
  mutate(
    month   = month(h.ReadDate),
    week    = week(h.ReadDate),
    weekday = wday(h.ReadDate, label = TRUE),
    # interval-ENDING hour; would need a 15-min rollback to be the interval's own hour
    h       = hour(i.ReadDate),
    hhmm    = fxn_hour_minute(i.ReadDate)
  )

# What about a grouping flag for weekday vs weekend

Factoring the WeekDays, Weekends, and HHMM for ending interval

# Treat identifiers as (ordered) factors so plots group/colour by them.
data$MeterIdentifier <- factor(data$MeterIdentifier, ordered = TRUE)
data$LocationNumber <- factor(data$LocationNumber, ordered = TRUE)

# NOTE(review): weekdays(abbreviate = TRUE) output is locale-dependent;
# 'Mon'..'Fri' only match in an English locale — confirm the runtime locale.
weekday_levels <- c('Mon', 'Tue', 'Wed', 'Thu', 'Fri')
data$isWeekDay <- factor((weekdays(data$h.ReadDate, abbreviate = TRUE)
                          %in% weekday_levels),
                          levels = c(FALSE, TRUE), labels = c('Weekend', 'Weekday'))

# Interval-ENDING labels: "00:15" .. "23:45" then "00:00" (midnight ends the
# day's last interval). Generated instead of hand-typing all 96 strings:
# 96 = 24 hours * 4 intervals/hour; %% 24 maps minute 1440 back to "00:00".
end_minutes <- seq(15, 24 * 60, by = 15)
hhmm_levels <- sprintf("%02d:%02d", (end_minutes %/% 60) %% 24, end_minutes %% 60)

data$hhmm <- factor(data$hhmm, levels = hhmm_levels, ordered = TRUE)

Calculate the max ReadValue for each Location for scaling

# Monthly max Readvalue per location — used below to scale each location's
# reads into [0, 1]. Add more grouping columns (e.g. h.ReadDate) if a
# per-day max is ever needed.
data <-
  data %>%
   group_by(LocationNumber) %>%
   mutate(max.per.group = max(as.numeric(Readvalue))) %>%
   # fix: drop the grouping afterwards so later verbs on `data` don't
   # silently operate per location
   ungroup()

Scale the Readvalue

Allows us to compare locations without scaling issues.

# Scale each read by its location's monthly max so values land in [0, 1]
# for every meter (assumes max.per.group > 0 for each location — a location
# with all-zero reads would divide by zero; TODO confirm).
data$scale_by_monthmax <- data$Readvalue / data$max.per.group
# NOTE(review): the summary above shows min(h.ReadDate) == 2018-01-01 00:00:00,
# so the strict `<` comparison selects ZERO rows — was "2018-02-01" or `<=`
# intended? Confirm before relying on `review`.
review <- subset(data, data$h.ReadDate < as.POSIXct("2018-01-01") & data$LocationNumber == 4191826)
#data$interval <- fxnInterval_from_datetime(data$dtReadDate)

Calculate the data summary

# Per-interval summary statistics for each location/meter/unit-of-measure.
# Open question (original note): should weekday join the grouping, since
# Sat/Sun load shapes differ from weekdays?
# NOTE: summarise() drops only the LAST grouping level, so data_summary stays
# grouped by hhmm/LocationNumber/MeterIdentifier afterwards — later bare verbs
# on it operate per group (see the count() result further down).
data_summary <- data %>%
  group_by(hhmm, LocationNumber, MeterIdentifier, Uom)  %>%
  summarise(mean = mean(Readvalue),
            median = median(as.numeric(Readvalue)),  # as.numeric is a no-op; Readvalue is already double
            min = min(Readvalue),
            max = max(Readvalue),
            total = sum(Readvalue),
            std = sd(Readvalue), 
            scaled_mean = mean(scale_by_monthmax),
            n = n())  %>% 
  arrange(hhmm, LocationNumber, MeterIdentifier, Uom) 

Calculate the data totals

To be used for removing dirty data.

# Should dtReadDay, h, are anyother field be usable here?  weekday at least since Sat and Sun are not diff load shapes
# One row per location/meter pair: observed interval count vs the 2976
# (= 31 days * 96 intervals) expected for a complete January.
data_total <- data_summary %>%
  group_by(LocationNumber, MeterIdentifier) %>%
  summarise(
    intervals = sum(n),
    expected  = 2976          # numeric literal; the old as.numeric() wrapper was a no-op
  ) %>%
  arrange(LocationNumber, MeterIdentifier)

Sample the Data and Remove Bad Locations from the Sample

# Fixed seed so the sampled locations are reproducible across knits.
set.seed(8675)
# NOTE(review): this samples 1000 interval ROWS (location values repeat per
# meter reading), not 1000 distinct locations — and `sample.locations` is
# never used again below. Confirm whether this line is needed at all.
sample.locations <- sample(data$LocationNumber,1000)

length(sample.locations)
## [1] 1000
# get the unique locations
x <- unique(data$LocationNumber)
locationNumber <- data.frame(x)

summary(locationNumber)
##        x        
##  10260  :    1  
##  10280  :    1  
##  10550  :    1  
##  10640  :    1  
##  10645  :    1  
##  10660  :    1  
##  (Other):13700
# Draw 5 random locations, then force the reference location 4191826 in.
# NOTE(review): the name `sample` shadows base::sample in the global env
# (function lookup still finds the base function, but rename for clarity).
sample <- locationNumber[sample(nrow(locationNumber), 5), ]
sample <- c(sample, '4191826')
length(sample)
## [1] 6
length(unique(sample))
## [1] 6
# Get the data just for the unique locations
sampleData <- subset(data, data$LocationNumber %in% sample)

Count of the Locations with Good Data

# Fix: the original compared the WHOLE data frame (`data_total < 2976`), which
# produced the malformed-logical-index warning above and a single NA group.
# Compare the `intervals` column instead.
# NOTE(review): the heading says "Good Data" but `< 2976` selects INCOMPLETE
# locations — confirm which count is actually wanted. data_total is still
# grouped by LocationNumber, so count() tallies per group.
count(subset(data_total, intervals < 2976))
## Warning: Length of logical index must be 1 or 13728, not 54912
## # A tibble: 1 x 2
## # Groups:   LocationNumber [1]
##   LocationNumber     n
##   <ord>          <int>
## 1 <NA>            2399

Scale the Summary Values

# Scale each interval mean by the GLOBAL max of all location means (not a
# per-location max), so the largest consumer pins 1.0 and everyone else
# compresses toward 0 — presumably intentional for cross-location comparison;
# TODO confirm.
data_summary$scaled_value_max <- data_summary$mean/max(data_summary$mean)
# mean/sd per interval (inverse coefficient of variation); NaN or Inf
# where std is NA or 0 — TODO confirm downstream code tolerates that.
data_summary$scaled_value_sd <- data_summary$mean/data_summary$std
# x <- subset(data_summary,data_summary$LocationNumber == 11280)
# 
# write.table(x, file = "sample2.csv", 
#             sep = ",", 
#             col.names = NA,
#             qmethod = "double")

Histogram for January 2018 - All Locations in this Sample, 40,135,207 intervals

Data is left shifted, and we have a long right tail. Data is skewed by a few meters.
The data contains locations that are not representative of the entire dataset.

Data is not normally distributed.

#h1 <- 
#  ggplot(data,aes(x = data$Readvalue)) +
#  geom_histogram(binwidth=0.05, color = 'black', fill = '#333333') +
#  scale_x_continuous(limits=c(0,50)) + 
#  ggtitle("Histogram of kWh (bin=0.05)")
#h2 <- h1 + scale_x_log10()
#h3 <- h1 + scale_x_sqrt()
#h4 <- h1 + coord_trans('sqrt')

Location 4191826

A review of the read values shows us a left skewed distribution, and too much noise in the scatter plots.

A log10 scale for readvalue helps to normally distribute the data set. Which allows us to see the distribution much better. This will help us to get closer to a more linear model.

Location 70220

Location 1751219

Location 9142030

Location 9030286

## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 907 rows containing non-finite values (stat_bin).
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 910 rows containing non-finite values (stat_bin).

Scaled Only so we can visually compare

Compare 4191826 and 1751219 by Day

We saw above the locations for 4191826 and 1751219 have similar mean load shapes. Let’s see if they have similar day to day load shapes for the read values.

Scaled Plot 4191826 and 1751219

locations <- c( 4191826,1751219 )
df <- subset(data, data$LocationNumber %in% locations)

# Per-day facets of the RAW reads for the two locations.
# Fix: y is the raw Readvalue, but the old axis label claimed scaled kWh
# (and had an unbalanced parenthesis) — label the axis for what is plotted.
ggplot(df,aes(x=hhmm, y=Readvalue, group = LocationNumber, color = LocationNumber)) +
    geom_line() +
    facet_wrap(h.ReadDate ~ weekday, ncol=7) +
    xlab("Time") +
    ylab("kWh")

fxn_scaled_plot_facet_wrap(locations)

Raw Interval Plot 4191826 and 1751219

# Fix: the trailing "+" after ylab("kWh") made the parser continue the plot
# expression onto rm(df), adding rm()'s NULL result to the plot and deleting
# df in the middle of the expression. Terminate the plot, then clean up.
ggplot(df,aes(x=hhmm, y=Readvalue, group=LocationNumber, color = LocationNumber)) +
    geom_line() +
    facet_wrap(h.ReadDate ~ weekday, ncol=7) +
    xlab("Time") +
    ylab("kWh")

rm(df)

fxn_scaled_plot_facet_wrap(locations)

Scaled Interval Plot 4191826 and 4066442

locations <- c(4191826,4066442)
df <- subset(data, data$LocationNumber %in% locations)

# Per-day facets of the month-max-scaled reads for the two locations.
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = LocationNumber)) +
    geom_line() +
    facet_wrap(h.ReadDate ~ weekday, ncol=7) +
    xlab("Time") +
    ylab("scaled kWh (ReadValue/max(ReadValue))") +  # fix: unbalanced parenthesis in label
    ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))

fxn_scaled_plot_facet_wrap(locations)

Best Fit - Raw Interval Plot 70220 and 535450 - Best Fit

locations <- c(70220,535450)
df <- subset(data, data$LocationNumber %in% locations)

# Best-fit pair: per-day facets of the month-max-scaled reads.
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = LocationNumber)) +
    geom_line() +
    facet_wrap(h.ReadDate ~ weekday, ncol=7) +
    xlab("Time") +
    ylab("scaled kWh (ReadValue/max(ReadValue))") +  # fix: unbalanced parenthesis in label
    ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))

fxn_scaled_plot_facet_wrap(locations)

Worst Fit - Raw Interval Plot 1751219 and 447770 - Worst Fit

locations <- c(1751219,447770)
df <- subset(data, data$LocationNumber %in% locations)

# Worst-fit pair: per-day facets of the month-max-scaled reads.
ggplot(df,aes(x=hhmm, y=scale_by_monthmax, group=LocationNumber, color = LocationNumber)) +
    geom_line() +
    facet_wrap(h.ReadDate ~ weekday, ncol=7) +
    xlab("Time") +
    ylab("scaled kWh (ReadValue/max(ReadValue))") +  # fix: unbalanced parenthesis in label
    ggtitle(paste("Load Shapes | January 2018 | (", paste(c("Locations: ", locations), collapse=" "), ")"))

Worst fit because we used the mean instead of the scaled mean to calculate the sum of the percent diff for all intervals. A scaled mean would be less of an issue based on the plots below.

# Mean vs scaled-mean summary plots for the worst-fit pair defined above.
fxn_scaled_plot_facet_wrap(locations)

Building a Linear Model

We would like to build a linear model for kWh.

# library(memisc)
# 
# locations <- c(4191826)
# df <- subset(data, data$LocationNumber %in% locations)
# 
# #Subtation
# #isWeekDay
# #weekday
# 
# 
# m1 <- lm(Readvalue ~ hhmm + weekday,data = df)
# m2 <- lm(Readvalue ~ hhmm + isWeekDay + weekday,data = df)
# #m2 <- update(m1 ~ . + isweekDay)
# #m2 <- update(m1 ~ . + weekday)
# #m3 <- update(m2 ~ . + SubstationName)
# mtable(m1)
# 
# 
# 
# layout(matrix(c(1,2,3,4),2,2))
# plot(m1)
# 
# 
# anova(m1, m2)

Model Predictions

# 
# thisIntervalkWh <- data.frame(hhmm = '12:00', isWeekDay = 'Weekday', weekday = 'Mon' )
# modelEstimate <- predict(m2, newdata = thisIntervalkWh, interval='prediction', level=0.95)
# 
# head(modelEstimate)
# 
# exp(modelEstimate)

Normalize the Data and review again

Normalizing the data so all load profiles are similar.

We do this by taking the max value and dividing all values by the max. This allows us to compare all locations (timeseries objects) without focusing on the total consumption.

TBD

Questions

  1. How often are we expected to run this analysis?

What we need is something like this:

LocationNumber : {scaled_value1, …. scaled_value2} - Possible additional factors, Sub, RateCode, Weekday, hhmm, GIS Location???

Notes

White-paper

Predicting Consumer Load Profiles Using Commercial and Open Data Dauwe Vercamer, Bram Steurtewagen, Dirk Van den Poel, Senior Member, IEEE, and Frank Vermeulen

This paper addresses the issue of assigning new customers, for whom no AMI readings are available, to one of these load profiles. This post-clustering phase has received little attention in the past

Step: Identify and remove outliers from each location. Identify and check on 0 values, and remove. Identify and correct or remove duplicate intervals… identify and remove locations with not enough data.

Example data found - 33 days in the month (What's the cause, how to clean) - 1 day available in the month (what's the cause, how to clean or remove)

Quick Goal: Focus on perfect data for the first run. Clean up will be a larger effort after the model is created. So… 31 days of data, every 15-minute interval of every day. Should be 31*96 = 2976 intervals per meter/location.

Long Term Goal: - Remove outliers (I’m seeing one on the first meter already)

Step 1: Identify the load profiles. check for outliers in the first metering point location. See if the chosen load profile is accurate enough to generate a model.

LP can be Daily Weekly Monthly

Question: how to compare the load shapes??????? Maybe - an average daily 15 minute pattern (96 measurements per customer). Average over what time frame, week, or month? Season? Location -> pivoted data… or columar?

Location, ReadValue1, …. ReadValue96

or

Question: Also, how many unique / relative profiles exist in the Duck River kWh data? Question: How to handled weekdays and weekends? Include or Exclude from the means? Question: Do we want a profile for day or profile for weekdays and weekends?

Question: Can we use Pearson's correlation to determine if locations are correlated? Is this even correct to do here? See Lesson4_student.html examples

Rate Codes: Do we have more unique profiles which require more rate codes, or do the rate codes we have, for each location exhibit the same load profile shapes for their current assigned rate code?

Attributes? What are the attributes that make up a load profile for Duck River’s data? Rate Code? Meter Type? Weather?


Steps - Identify Outliers - Remove Outliers - Aggregate to daily patterns / hourly partners - Filter time series to remove missing values - Normalize Time Series - remove estimates… etc…

Additional Steps Select a time series to be clustered perform spectral clustering perform k-means calculate davis-bouldin index calculate dunn index best rank of dun and d-b

Example Models or Load Shapes

Rate 22 All electric, 400A service, residential account Location 4191826, meter is 300071 and Account # 303606-002 Location 70220, meter is 303099 and account is 324749-001

All electric 200A service Location 1751219 Location 1522548 (has a scope and other out buildings) Account #:

All electric 600A service and geothermal HVAC Location 226760 -

Electric and gas or wood head 200A service Location 1630115 Location 1530057

Find all the accounts that have similar load shape.

References

kernlab package for clustering ada and Random Forest for classification https://rpubs.com/FelipeRego/K-Means-Clustering https://www.r-bloggers.com/clustering-mixed-data-types-in-r/ https://shiny.rstudio.com/gallery/kmeans-example.html https://towardsdatascience.com/how-to-cluster-your-customer-data-with-r-code-examples-6c7e4aa6c5b1 https://uc-r.github.io/kmeans_clustering https://robjhyndman.com/TSDL/

http://readr.tidyverse.org/reference/read_delim.html

https://escience.rpi.edu/data/DA/svmbasic_notes.pdf

https://cran.r-project.org/web/packages/kernlab/kernlab.pdf

https://stats.stackexchange.com/questions/142400/quantifying-similarity-between-two-data-sets

https://www.rdocumentation.org/packages/SimilarityMeasures/versions/1.4/topics/LCSS